{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# python 基础\n",
    "\n",
    "## 字典"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "map_abc = {\n",
    "    \"a\" : 0,\n",
    "    \"b\" : [1,2,3],\n",
    "    \"c\" : {\"cc\" : 4}\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(map_abc[\"a\"])\n",
    "print(map_abc[\"b\"], map_abc[\"b\"][1])\n",
    "print(map_abc[\"c\"][\"cc\"])\n",
    "for k in map_abc:\n",
    "    print(\"key:\",k, \"\\nvalue:\", map_abc[k])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## numpy 处理缺失数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "\n",
    "np_temp = [36.2, 36.3, 36.4, np.nan, 36.3, 36.2, 36.4]\n",
    "print(np.mean(np_temp))\n",
    "print(np.nanmean(np_temp))\n",
    "np_temp[3] = np.nanmean(np_temp)\n",
    "print(np.mean(np_temp))\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## pandas one hot 编码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df_onehot_example = pd.DataFrame({\"name\" : [u\"张三\", u\"李四\", u\"王五\", u\"李武\"], \n",
    "                                  \"job\" : [u\"工人\", u\"农民\", u\"军人\", u\"工人\"]})\n",
    "\n",
    "pd.get_dummies(df_onehot_example, columns=[\"job\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## matplotlib 作图与过拟合欠拟合"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# http://scikit-learn.org/stable/auto_examples/model_selection/plot_underfitting_overfitting.html\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import PolynomialFeatures\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.model_selection import cross_val_score\n",
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format = 'retina'\n",
    "\n",
    "def true_fun(X):\n",
    "    return np.cos(1.5 * np.pi * X)\n",
    "\n",
    "np.random.seed(42)\n",
    "\n",
    "n_samples = 30\n",
    "degrees = [1, 4, 15]\n",
    "\n",
    "X = np.sort(np.random.rand(n_samples))\n",
    "y = true_fun(X) + np.random.randn(n_samples) * 0.1\n",
    "\n",
    "plt.figure(figsize=(14, 5))\n",
    "for i in range(len(degrees)):\n",
    "    ax = plt.subplot(1, len(degrees), i + 1)\n",
    "    plt.setp(ax, xticks=(), yticks=())\n",
    "\n",
    "    polynomial_features = PolynomialFeatures(degree=degrees[i],\n",
    "                                             include_bias=False)\n",
    "    linear_regression = LinearRegression()\n",
    "    pipeline = Pipeline([(\"polynomial_features\", polynomial_features),\n",
    "                         (\"linear_regression\", linear_regression)])\n",
    "    pipeline.fit(X[:, np.newaxis], y)\n",
    "\n",
    "    # Evaluate the models using crossvalidation\n",
    "    scores = cross_val_score(pipeline, X[:, np.newaxis], y,\n",
    "                             scoring=\"neg_mean_squared_error\", cv=10)\n",
    "\n",
    "    X_test = np.linspace(0, 1, 100)\n",
    "    plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label=\"Model\")\n",
    "    plt.plot(X_test, true_fun(X_test), label=\"True function\")\n",
    "    plt.scatter(X, y, edgecolor='b', s=20, label=\"Samples\")\n",
    "    plt.xlabel(\"x\")\n",
    "    plt.ylabel(\"y\")\n",
    "    plt.xlim((0, 1))\n",
    "    plt.ylim((-2, 2))\n",
    "    plt.legend(loc=\"best\")\n",
    "    plt.title(\"Degree {}\\nMSE = {:.2e}(+/- {:.2e})\".format(\n",
    "        degrees[i], -scores.mean(), scores.std()))\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 鸢尾花数据集\n",
    "\n",
    "## 预览数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import pandas as pd\n",
    "from sklearn import datasets\n",
    "\n",
    "%matplotlib inline\n",
    "sns.set_style(\"white\")\n",
    "\n",
    "data = datasets.load_iris()\n",
    "for k in data:\n",
    "    print(\"############\\n##%s##\\n############\\n\" % k)\n",
    "    print(data[k])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df = pd.DataFrame(data.data)\n",
    "df.columns = data.feature_names\n",
    "df['species'] = [ data['target_names'][x] for x in data.target ]\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_cnt = df['species'].value_counts().reset_index()\n",
    "df_cnt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.barplot(data=df_cnt, x='index', y='species')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 均值方差"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "pd.melt(df, id_vars=['species'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 正态分布检验"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy import stats\n",
    "\n",
    "for i in range(4):\n",
    "    name = data.feature_names[i]\n",
    "    ax = plt.subplot(2,2,i+1)\n",
    "    stats.probplot(df[name], plot=ax)\n",
    "    ax.set_title(name)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 分组观察"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.melt(df, id_vars=['species']).pivot_table(index=['species'], columns=['variable'], aggfunc=[np.mean, np.var])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.pairplot(df, hue=\"species\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize=(12,4))\n",
    "\n",
    "for i in range(3):\n",
    "    name = data.target_names[i]\n",
    "    ax = plt.subplot(1,3,i+1)\n",
    "    stats.probplot(df[df['species']==name][data.feature_names[2]], plot=ax)\n",
    "    ax.set_title(name)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 基于 PCA 数据降维"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.decomposition import PCA\n",
    "pca = PCA()\n",
    "df_sub = df[data.feature_names[0:3]]\n",
    "pca.fit(df_sub)\n",
    "pca_result = pca.transform(df_sub)\n",
    "fig = plt.figure(figsize=(4,4))\n",
    "ax = fig.add_subplot(111)\n",
    "ax.scatter(pca_result[:, 0], pca_result[:, 1], c=data.target, cmap=plt.cm.Set3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from mpl_toolkits.mplot3d import Axes3D\n",
    "\n",
    "plane_show_size_ratio = 5\n",
    "plane_show_shift = df_sub.mean().values\n",
    "pca_score = pca.explained_variance_ratio_\n",
    "V = pca.components_\n",
    "l_pca_axis = V.T * plane_show_size_ratio\n",
    "l_pca_plane = []\n",
    "for pca_axis in l_pca_axis:\n",
    "    l_pca_plane.append(np.r_[pca_axis[:2], - pca_axis[1::-1]].reshape(2,2))\n",
    "\n",
    "fig = plt.figure(figsize=(4,4))\n",
    "ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=150, azim=-34 )\n",
    "ax.scatter(df_sub.values[:,0], df_sub.values[:,1],df_sub.values[:,2], '.', c=data.target, cmap=plt.cm.Set3)\n",
    "ax.plot_surface(l_pca_plane[0]+plane_show_shift[0],\n",
    "                l_pca_plane[1]+plane_show_shift[1],\n",
    "                l_pca_plane[2]+plane_show_shift[2], alpha=0.1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 基于流形假设降维"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.manifold import Isomap, MDS, SpectralEmbedding\n",
    "\n",
    "n_components = 2\n",
    "n_neighbors = 10\n",
    "X = df.drop(['species'], axis=1)\n",
    "color = data.target\n",
    "\n",
    "fig = plt.figure(figsize=(12, 4))\n",
    "\n",
    "\n",
    "Y = Isomap(n_neighbors, n_components).fit_transform(X)\n",
    "ax = fig.add_subplot(131)\n",
    "ax.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Set3)\n",
    "ax.set_title(\"Isomap\")\n",
    "\n",
    "Y = MDS(n_components, max_iter=100, n_init=1).fit_transform(X)\n",
    "ax = fig.add_subplot(132)\n",
    "ax.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Set3)\n",
    "ax.set_title(\"MDS\")\n",
    "\n",
    "Y = SpectralEmbedding(n_components=n_components,n_neighbors=n_neighbors).fit_transform(X)\n",
    "ax = fig.add_subplot(133)\n",
    "ax.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Set3)\n",
    "ax.set_title(\"Isomap\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## 数据预处理\n",
    "\n",
    "### 划分训练集验证集并进行标准化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler \n",
    "\n",
    "# df 作为初步训练集，划分 80% 作为训练集， 20% 作为验证集\n",
    "df_train, df_val = train_test_split(df, train_size=0.8, random_state=0)\n",
    "\n",
    "# 提前特征，这里是将分类结果舍弃\n",
    "X_train = df_train.drop(['species'], axis=1)\n",
    "X_val   = df_val.drop(['species'], axis=1)\n",
    "# 提取分类结果\n",
    "Y_train = df_train['species']\n",
    "Y_val   = df_val['species']\n",
    "\n",
    "# 设定分布 X_scaler，用训练集估计(fit)分布，然后对验证集进行转换(transform)\n",
    "X_scaler = StandardScaler()\n",
    "X_trainT = X_scaler.fit_transform(X_train)\n",
    "X_valT   = X_scaler.transform(X_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 这里将保证训练集是标准正态分布，验证集不一定满足这个条件，但不会差很多\n",
    "print(X_trainT.mean(axis=0), X_trainT.var(axis=0))\n",
    "print(X_valT.mean(axis=0), X_valT.var(axis=0))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 特征挖掘\n",
    "\n",
    "## 观察特征后，基于数据性质手动组合重要特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.utils import shuffle\n",
    "import matplotlib as mpl\n",
    "from cycler import cycler\n",
    "mpl.rcParams['axes.prop_cycle'] = cycler(color='rb')\n",
    "\n",
    "np.random.seed(42)\n",
    "pseudoNum1 = 300\n",
    "pseudoNum2 = 300\n",
    "np_pho1   = 4.5 + np.random.rand(pseudoNum1)*2\n",
    "np_pho2   = 0.5 + np.random.rand(pseudoNum2)*2\n",
    "np_theta1 = np.random.rand(pseudoNum1)*360 / 2*np.pi\n",
    "np_theta2 = np.random.rand(pseudoNum2)*360 / 2*np.pi\n",
    "\n",
    "np_x1 = np_pho1 * np.cos(np_theta1)\n",
    "np_y1 = np_pho1 * np.sin(np_theta1)\n",
    "np_x2 = np_pho2 * np.cos(np_theta2)\n",
    "np_y2 = np_pho2 * np.sin(np_theta2)\n",
    "\n",
    "pd_circ = shuffle(pd.DataFrame({\n",
    "    \"X\" : list(np_x1)+list(np_x2), \n",
    "    \"Y\" : list(np_y1)+list(np_y2), \n",
    "    \"label\" : [\"Class1\" for x in range(pseudoNum1)] + [\"Class2\" for x in range(pseudoNum2)]\n",
    "}), random_state=0).reset_index().drop(['index'],axis=1)\n",
    "pd_circ0 = pd_circ.copy()\n",
    "pd_circ.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for sub in [\"Class1\", \"Class2\"]:\n",
    "    pd_sub = pd_circ[pd_circ['label']==sub]\n",
    "    plt.plot(pd_sub[\"X\"], pd_sub[\"Y\"], \".\", label=sub)\n",
    "\n",
    "plt.legend()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.pairplot(pd_circ, hue=\"label\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd_circ['X_add_Y'] = pd_circ['X'] + pd_circ['Y']\n",
    "pd_circ['X_time_Y'] = pd_circ['X'] * pd_circ['Y']\n",
    "pd_circ['X2_add_Y2'] = pd_circ['X'] * pd_circ['X'] + pd_circ['Y'] * pd_circ['Y']\n",
    "sns.pairplot(pd_circ, hue=\"label\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 基于正态分布的模型，可以用正态分布的特性理解非线性数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd_circ_melt = pd_circ0.melt(id_vars=['label']).pivot_table(index=['variable'], columns=['label'], aggfunc=[np.mean, np.var])\n",
    "pd_circ_melt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![](https://docs.scipy.org/doc/numpy/_images/math/3f40671c78b1cb1d6a6f4a306a2b39a6d55921cf.png)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "mean_X = pd_circ_melt['mean']['value'].loc['X'].values.reshape(2,1)\n",
    "mean_Y = pd_circ_melt['mean']['value'].loc['Y'].values.reshape(2,1)\n",
    "var_X  = pd_circ_melt['var']['value'].loc['X'].values.reshape(2,1)\n",
    "var_Y  = pd_circ_melt['var']['value'].loc['Y'].values.reshape(2,1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "probX = 1 / np.sqrt(2*np.pi*var_X) * np.exp(-1*(np.array([pd_circ['X'].values,pd_circ['X'].values]).reshape(2,600)-mean_X)**2 / (2*var_X))\n",
    "probY = 1 / np.sqrt(2*np.pi*var_Y) * np.exp(-1*(np.array([pd_circ['Y'].values,pd_circ['Y'].values]).reshape(2,600)-mean_Y)**2 / (2*var_Y))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "pd2 = pd.DataFrame(probX.T*probY.T)\n",
    "pd_circ['pred'] = pd2.apply(lambda x: \"Class2\" if x[0] < x[1] else \"Class1\", axis=1)\n",
    "pd_circ"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd_circ[pd_circ['label']!=pd_circ['pred']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 模型训练\n",
    "\n",
    "## 交叉熵"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "np_yhat = np.linspace(0, 1, 101)\n",
    "np_h = -(0*np.log(np_yhat) + 1*np.log(1-np_yhat))\n",
    "plt.plot(np_yhat, np_h)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 动手写简单的逻辑斯蒂回归"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y = data.target\n",
    "y[y==2] = 1\n",
    "X = np.hstack([data.data, np.ones_like(y).reshape(len(y),1)])\n",
    "C = 1\n",
    "alpha = 0.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(42)\n",
    "omega = np.random.random(X.shape[1]).reshape(5, 1)\n",
    "for i in range(10):\n",
    "    y_hat = 1 / (1+np.exp(-X.dot(omega)))\n",
    "    dL = X.T.dot(C * (y.reshape(-1,1) - y_hat))\n",
    "    omega += alpha * dL \n",
    "    \n",
    "omega"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.plot( 1 / (1+np.exp(-X.dot(omega))))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 考虑 L2 正则化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(42)\n",
    "omega = np.random.random(X.shape[1]).reshape(5, 1)\n",
    "for i in range(10):\n",
    "    y_hat = 1 / (1+np.exp(-X.dot(omega)))\n",
    "    dL = X.T.dot(C * (y.reshape(-1,1) - y_hat)) + omega\n",
    "    omega += alpha * dL\n",
    "    \n",
    "omega"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "plt.plot(1 / (1+np.exp(-X.dot(omega))))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 使用 sklearn 库进行逻辑斯蒂回归"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LogisticRegression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = LogisticRegression(C=1)\n",
    "model.fit(X, y)\n",
    "plt.plot(model.predict(X))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 10折交叉验证与网格搜索 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import GridSearchCV\n",
    "parameters = {'C':[0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}\n",
    "model = LogisticRegression()\n",
    "clf = GridSearchCV(model, parameters, cv=10)\n",
    "clf.fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.plot( np.log10(np.array([0.0001, 0.001, 0.01, 0.1, 1, 10, 100])), clf.cv_results_['mean_train_score'], label=\"train\")\n",
    "plt.plot( np.log10(np.array([0.0001, 0.001, 0.01, 0.1, 1, 10, 100])), clf.cv_results_['mean_test_score'], label=\"test\")\n",
    "plt.xlabel(\"log10 C\")\n",
    "plt.legend()\n",
    "plt.ylabel(\"Cross Validation Accuracy\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## AUC值用于评价数据分布不均情况下的模型质量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import roc_curve,auc\n",
    "np.random.seed(42)\n",
    "# 假如真实情况1万个病人，有10个是有病的\n",
    "np_real = np.array([0.0  for i in range(9990)] + [1.0 for i in range(10) ], dtype=bool)\n",
    "\n",
    "# 预测1， 全预测为没问题，准确率 99.90%\n",
    "np_pred_allf = 0.1*np.random.random(10000)\n",
    "\n",
    "# 预测2:，准确预测使用情况，准确率 100%\n",
    "np_pred_true = np_pred_allf.copy()\n",
    "np_pred_true[-10:] = 0.99\n",
    "\n",
    "fpr, tpr, thresholds    = roc_curve(np.array(np_real, dtype=int),np_pred_allf )\n",
    "AUC_value = auc(fpr, tpr)\n",
    "\n",
    "fpr2, tpr2, thresholds2 = roc_curve(np.array(np_real, dtype=int),np_pred_true )\n",
    "AUC_value2 = auc(fpr2, tpr2)\n",
    "\n",
    "# 虽然准确率差不多，但是AUC值差异巨大\n",
    "print(AUC_value, AUC_value2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fpr2 = np.array([0] + list(fpr2))\n",
    "tpr2 = np.array([0] + list(tpr2))\n",
    "\n",
    "plt.figure(figsize=(5,5))\n",
    "lw = 2\n",
    "plt.plot(fpr, tpr, color='darkorange',\n",
    "         lw=lw, label='ROC curve 1 (area = %0.2f)' % AUC_value)\n",
    "plt.plot(fpr2, tpr2, color='g',\n",
    "         lw=lw, label='ROC curve 2 (area = %0.2f)' % AUC_value2)\n",
    "\n",
    "plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')\n",
    "\n",
    "plt.xlabel('False Positive Rate')\n",
    "plt.ylabel('True Positive Rate')\n",
    "plt.title('Receiver operating characteristic example')\n",
    "plt.legend(loc=\"lower right\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
